library(tidyverse)
## ── Attaching packages ──────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ─────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(rvest)
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(readr)
library(viridis)
## Loading required package: viridisLite
library(leaflet)
library(patchwork)
library(RColorBrewer)
library(corrplot)
## corrplot 0.84 loaded
knitr::opts_chunk$set(
    echo = TRUE,
    warning = FALSE,
    fig.width = 8, 
  fig.height = 6,
  out.width = "90%"
)
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
theme_set(theme_minimal() + theme(legend.position = "bottom",plot.title = element_text(face = "bold")))

load collision data

data_2018 = 
  read_csv("./data/2018data.csv") %>% 
  janitor::clean_names() 
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   TIME = col_time(format = ""),
##   `ZIP CODE` = col_double(),
##   LATITUDE = col_double(),
##   LONGITUDE = col_double(),
##   `NUMBER OF PERSONS INJURED` = col_double(),
##   `NUMBER OF PERSONS KILLED` = col_double(),
##   `NUMBER OF PEDESTRIANS INJURED` = col_double(),
##   `NUMBER OF PEDESTRIANS KILLED` = col_double(),
##   `NUMBER OF CYCLIST INJURED` = col_double(),
##   `NUMBER OF CYCLIST KILLED` = col_double(),
##   `NUMBER OF MOTORIST INJURED` = col_double(),
##   `NUMBER OF MOTORIST KILLED` = col_double(),
##   COLLISION_ID = col_double()
## )
## See spec(...) for full column specifications.
newnames = colnames(data_2018) %>% 
  str_replace("number_of_","") 
names(data_2018) = newnames

tidy_data = 
data_2018 %>% 
  mutate(
    date_complete = date 
  ) %>% 
  separate(date, into = c("month", "day", "year"), sep = "/") %>% 
  separate(time, into = c("hour", "minute"), sep = ":") %>%
  mutate(
    date_paste = as.Date(paste("2018",month,day,sep = "-"))
  ) %>% 
  select(-zip_code, -location, -on_street_name, -cross_street_name, -off_street_name,-collision_id,-year) %>% 
  rename("vehicle_type" = "vehicle_type_code_1",
         "cause" = "contributing_factor_vehicle_1") %>% 
  mutate( day = as.numeric(day),
          month = as.numeric(month),
          hour = as.numeric(hour),
          minute = as.numeric(minute),
          latitude = replace_na(latitude,0),
          vehicle_type = str_to_lower(vehicle_type)
  ) %>%
  filter( latitude != 0) %>% 
  drop_na(borough)

Vehicle type

vehicle_type_data = 
  tidy_data %>% 
  mutate(
    vehicle_type = replace(vehicle_type,str_detect(vehicle_type,"truck"),"truck"),
    vehicle_type = replace(vehicle_type,str_detect(vehicle_type,"sport utility"),"sport utility vehicle"),
    vehicle_type = replace(vehicle_type, vehicle_type %in% c("taxi","passenger vehicle","sedan","truck","sport utility vehicle") == FALSE, "others")) %>% 
  group_by(vehicle_type,hour) %>% 
  summarize(
    n = n()
  )

vehicle_type_plot = 
vehicle_type_data %>% 
  ggplot(aes(x = hour, y = n, color = vehicle_type)) +
  geom_line() +
  labs(
    color = "vehicle type",
    title = "Collisions of Day for Different Vehicles",
    x = list(title = "Hour of Day"),
    y = list(title = "Collisions")
    )
  
ggplotly(vehicle_type_plot)

Top 10 Collision Reasons

# reason_data = 
#   tidy_data %>%
#   group_by(contributing_factor_vehicle_1) %>%
#   summarize(n = n()) %>% 
#   arrange(desc(n)) %>% 
#   head(10)
# reason_data %>% 
#   plot_ly(x = ~reorder(contributing_factor_vehicle_1,desc(n)), y = ~n, color = ~contributing_factor_vehicle_1 ,type = "bar") %>% 
#   layout(
#     title = "The Number of Items Ordered in Each Aisle",
#     xaxis = list(title = "Different Reasons"),
#     yaxis = list(title = "Count")
#     )

df_cause =  
  tidy_data %>% 
  select(borough,cause) %>% 
      filter(!(cause %in% c('', 
                            'Unspecified', 
                            'Driver Inattention/Distraction',
                            'Other Vehicular',
                            'Failure to Yield Right-of-Way'))
             ) 
df_cause_top = 
  df_cause %>%
  group_by(cause) %>%
  summarise(count = n()) %>%
  top_n(10, count) %>% 
  ungroup() %>% 
  arrange(count)

cause_by_boro =  
  df_cause %>% 
  semi_join(., df_cause_top, by = "cause") %>%
  group_by(borough, cause) %>%
  summarise(count = n()) %>% 
  # reorder cause levels for desirable order in bar chart display:
  mutate(
    cause = factor(cause, levels = pull(df_cause_top,cause))
    ) 

cause_by_boro_all = 
  cause_by_boro %>% 
  group_by(borough) %>%
  summarise(total = sum(count))

cause_by_boro = 
  left_join(cause_by_boro, cause_by_boro_all, by = "borough") %>%
  mutate(ratio = count / total) %>%
  select(-count, -total) 

cause_by_boro_plot =
  cause_by_boro %>% 
  ggplot(aes(x = cause, y = ratio, fill = borough)) +
      geom_bar(width = 0.5, stat = 'identity', show.legend = F) +
      coord_flip() +
      facet_grid(.~borough) +
      labs(title = 'Collisions by Causes',
           x = NULL,
           y = NULL) +
  theme(axis.text.x = element_text(angle = 45))
ggplotly(cause_by_boro_plot)

Mapping

data_2018 = tidy_data
map_data = rename(data_2018, long = latitude, lat = longitude) 
pal <- colorNumeric(
  palette = "Accent",
  domain = map_data$persons_injured)
map_data %>% 
  filter(persons_injured > 0) %>% 
  filter(!(lat < "-70" | lat >= "-75")) %>% 
  mutate(
    label = str_c("<b>vehicle type: ", vehicle_type, "</b><br>Persons Injured: ", persons_injured , sep = "") ) %>%
  leaflet() %>% 
  addTiles() %>%
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addLegend("bottomright", pal = pal, values = ~persons_injured,
    title = "Persons Injured",
    opacity = 1
  ) %>% 
  addCircleMarkers(
    ~lat, ~long,
    color = ~pal(persons_injured),
    radius = 0.5,
    popup = ~ label) 
data_2018_seperate = tidy_data 

Association between Day and People Injured or killed or Accident Number

data_day %>%
  filter(type == "accident number") %>% 
  arrange(desc(number))
## # A tibble: 365 x 3
## # Groups:   date [365]
##    date       type            number
##    <date>     <chr>            <dbl>
##  1 2018-11-15 accident number    695
##  2 2018-12-14 accident number    538
##  3 2018-11-09 accident number    532
##  4 2018-03-02 accident number    531
##  5 2018-03-22 accident number    521
##  6 2018-06-29 accident number    515
##  7 2018-09-06 accident number    515
##  8 2018-05-11 accident number    509
##  9 2018-06-14 accident number    509
## 10 2018-09-05 accident number    509
## # … with 355 more rows
plot_day =
  data_day %>% 
  ggplot(aes(x = date,y = number, color = type))+
  geom_line()+
  theme(
        axis.title = element_text(size=14,face="bold"),
        plot.title = element_text(hjust = 0.5,size=14,face="bold"))+
  labs(
   title = "Trend Over the Year",
   x = "Day of the Year",
   y = "Number") +
  annotate("text", x = as.Date("2018-11-15") , y = 695, label = "2018-11-15")

plot_kill_injured_day = ggplotly(plot_day)
plot_kill_injured_day

proportion of accident and injured people by borough, by hour

boro_events = 
  tidy_data %>% 
  drop_na(borough, persons_injured, persons_killed) %>%
  group_by(borough) %>%
  summarize(accident = n(),
            injured = sum(persons_injured),
            killed = sum(persons_killed)) %>%
  mutate(prop_accident = round(accident/sum(accident),4),
         prop_injured = round(injured/sum(injured),4),
         prop_killed = round(killed/sum(killed),4)) %>%
  pivot_longer(
    prop_accident:prop_killed,
    values_to = "proportion",
    names_prefix = "prop_",
    names_to = "events",
  ) 

boro_events_plot = 
  boro_events %>% 
  ggplot(aes(x = borough, y = proportion, fill = events)) +
  geom_bar(stat = "identity",position = "dodge") +
  labs(title ="proportion of accident and injured people by borough", 
       x = 'Borough',
       y = 'Proportion'
  )

ggplotly(boro_events_plot)

Number of accident and injured people in Brooklyn is greatest comparing with other boroughs. Queen is second. Staten Island have fewest accident and injured people. Manhattan and Bronx is in the middle.

hour_events = 
  tidy_data %>% 
  drop_na(hour, persons_injured, persons_killed) %>%
  group_by(hour) %>%
  summarize(accident = n(),
            injured = sum(persons_injured),
            killed = sum(persons_killed)) %>%
  mutate(prop_accident = round(accident/sum(accident),4),
         prop_injured = round(injured/sum(injured),4),
         prop_killed = round(killed/sum(killed),4)) %>%
  pivot_longer(
    prop_accident:prop_killed,
    values_to = "proportion",
    names_prefix = "prop_",
    names_to = "events",
  ) 

hour_events_plot =
hour_events %>% 
  ggplot(aes(x = hour, y = proportion, color = events)) +
  geom_line() +
  annotate("rect", xmin = 8, xmax = 20, ymin = 0, ymax = Inf, fill = "blue", alpha = .1) +
  labs(
    title = "proportion of accident and injured people by hour", 
    x =  'Hour',
    y = 'Proportion')

hour_events_plot = ggplotly(hour_events_plot)
hour_events_plot[['x']][['layout']][['shapes']] <- c()

hour_events_plot = layout(hour_events_plot, shapes = list(type = "rect", fillcolor = "pink", line = list(color = "pink"), opacity = 0.3, x0 = 8, x1 = 20, xref = "x", y0 = 0, y1 = 0.08, yref = "y"))

hour_events_plot

Frequency of accident and injured people in the period of 8 am-20 pm is higher than other period of time. Started from 3 am, number of accident and injured people increase and reach a small peak at 8am. At 5pm, it reaches a big peak. then it started to decrease.

Holiday Analysis

data_2018 = tidy_data

holiday_data = 
  data_2018 %>%
  select(date_complete, everything()) %>%
  mutate(holiday = "No Holiday") %>% 
  select(date_complete, holiday, everything()) %>% 
  mutate(holiday = replace(holiday, date_complete == "01/01/2018", "New Year's Day"), 
         holiday = replace(holiday, date_complete == "01/15/2018", "Martin Luther King, Jr. Day"),
         holiday = replace(holiday, date_complete == "01/13/2018"|date_complete =="01/14/2018", "Martin Luther King Day Weekend (2 days)"),
         holiday = replace(holiday, date_complete == "02/19/2018", "Presidents Day"),
         holiday = replace(holiday, date_complete == "02/17/2018"|date_complete =="02/18/2018", "Presidents Day Weekend (2 days)"),
         holiday = replace(holiday, date_complete == "05/28/2018", "Memorial Day"),
         holiday = replace(holiday, date_complete == "05/26/2018"|date_complete =="05/27/2018", "Memorial Day Weekend (2 days)"),
         holiday = replace(holiday, date_complete == "07/04/2018", "Independence Day"),
         holiday = replace(holiday, date_complete == "07/07/2018"|date_complete =="07/08/2018", "Independence Day Weekend (2 days)"),
         holiday = replace(holiday, date_complete == "09/03/2018", "Labor Day"),
         holiday = replace(holiday, date_complete == "09/01/2018"|date_complete =="09/02/2018", "Labor Day Weekend (2 days)"),
         holiday = replace(holiday, date_complete == "11/22/2018", "Thanksgiving Day"),
         holiday = replace(holiday, date_complete == "11/23/2018"|date_complete =="11/24/2018"|date_complete =="11/25/2018", "Thanksgiving Weekend (3 days)"),
         holiday = replace(holiday, date_complete == "12/25/2018", "Christmas Day"),
         holiday = replace(holiday, date_complete == "12/26/2018"|date_complete =="12/27/2018", "Christmas Day Weekend (2 days)")
         )
holiday_data =
  holiday_data %>% 
  drop_na(persons_injured)

holiday_data %>% 
  group_by(holiday) %>% 
  summarise(total_injured = sum(persons_injured))
## # A tibble: 16 x 2
##    holiday                                 total_injured
##    <chr>                                           <dbl>
##  1 Christmas Day                                      70
##  2 Christmas Day Weekend (2 days)                    200
##  3 Independence Day                                   72
##  4 Independence Day Weekend (2 days)                 184
##  5 Labor Day                                         111
##  6 Labor Day Weekend (2 days)                        199
##  7 Martin Luther King Day Weekend (2 days)           181
##  8 Martin Luther King, Jr. Day                       106
##  9 Memorial Day                                       93
## 10 Memorial Day Weekend (2 days)                     207
## 11 New Year's Day                                     68
## 12 No Holiday                                      34205
## 13 Presidents Day                                     78
## 14 Presidents Day Weekend (2 days)                   155
## 15 Thanksgiving Day                                   52
## 16 Thanksgiving Weekend (3 days)                     252
bronx = 
  holiday_data %>% 
  filter(borough == "BRONX") %>% 
  group_by(holiday) %>% 
  summarise(bronx_total_injured = sum(persons_injured))
brooklyn = 
  holiday_data %>% 
  filter(borough == "BROOKLYN") %>% 
  group_by(holiday) %>% 
  summarise(brooklyn_total_injured = sum(persons_injured)) %>% 
  select(brooklyn_total_injured)
manhattan = 
  holiday_data %>% 
  filter(borough == "MANHATTAN") %>% 
  group_by(holiday) %>% 
  summarise(manhattan_total_injured = sum(persons_injured))%>% 
  select(manhattan_total_injured)
queens = 
  holiday_data %>% 
  filter(borough == "QUEENS") %>% 
  group_by(holiday) %>% 
  summarise(queens_total_injured = sum(persons_injured))%>% 
  select(queens_total_injured)
staten_island = 
  holiday_data %>% 
  filter(borough == "STATEN ISLAND") %>% 
  group_by(holiday) %>% 
  summarise(staten_island_total_injured = sum(persons_injured))%>% 
  select(staten_island_total_injured)

holiday_borough = 
  cbind(bronx, brooklyn, manhattan, queens, staten_island) %>% 
  mutate(total = bronx_total_injured+brooklyn_total_injured+manhattan_total_injured+    queens_total_injured+staten_island_total_injured) %>% 
  filter(holiday!="No Holiday")

knitr::kable(holiday_borough)
holiday bronx_total_injured brooklyn_total_injured manhattan_total_injured queens_total_injured staten_island_total_injured total
Christmas Day 10 25 5 24 6 70
Christmas Day Weekend (2 days) 34 99 23 41 3 200
Independence Day 18 32 3 18 1 72
Independence Day Weekend (2 days) 26 60 38 54 6 184
Labor Day 3 60 13 32 3 111
Labor Day Weekend (2 days) 31 89 10 62 7 199
Martin Luther King Day Weekend (2 days) 23 58 40 53 7 181
Martin Luther King, Jr. Day 27 42 14 19 4 106
Memorial Day 16 38 12 24 3 93
Memorial Day Weekend (2 days) 36 70 27 67 7 207
New Year’s Day 10 34 9 14 1 68
Presidents Day 5 32 8 32 1 78
Presidents Day Weekend (2 days) 30 42 28 51 4 155
Thanksgiving Day 13 17 5 17 0 52
Thanksgiving Weekend (3 days) 42 74 39 93 4 252
holiday_borough_day = 
  holiday_borough %>% 
  filter(holiday =="Christmas Day"|holiday =="Independence Day"|holiday =="Labor Day"|holiday =="Martin Luther King, Jr. Day"|holiday =="Memorial Day"|holiday =="New Year’s Day"|holiday =="Thanksgiving Day"|holiday =="Presidents Day")

knitr::kable(holiday_borough_day)
holiday bronx_total_injured brooklyn_total_injured manhattan_total_injured queens_total_injured staten_island_total_injured total
Christmas Day 10 25 5 24 6 70
Independence Day 18 32 3 18 1 72
Labor Day 3 60 13 32 3 111
Martin Luther King, Jr. Day 27 42 14 19 4 106
Memorial Day 16 38 12 24 3 93
Presidents Day 5 32 8 32 1 78
Thanksgiving Day 13 17 5 17 0 52
holiday_borough_day = holiday_borough_day[order(holiday_borough_day$total),]

# On the day of the holiday

plot_day =
pivot_longer(
    holiday_borough_day, 
    bronx_total_injured:staten_island_total_injured,
    names_to = "borough", 
    values_to = "person_injured") %>% 
  ggplot(aes(x = reorder(holiday, -total), y = person_injured, fill = borough)) +
  geom_bar(stat="identity") +
  theme(legend.position = "right", axis.text.x = element_text(angle = 45)) +
  labs(
    title = "Persons Injured by Car Crashs \n on Federal Holidays (2018)",
    x = "Holiday",
    y = "# Persons Injured") +
  scale_fill_discrete(name = "Borough", labels = c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"))

# scale_fill_discrete(name = "Borough", labels = c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island")) 

On three day weekends

data_2018 = tidy_data

holiday_data_weekend = 
  data_2018 %>%
  select(date_complete, everything()) %>%
  mutate(holiday = "No Holiday") %>% 
  select(date_complete, holiday, everything()) %>% 
  mutate(holiday = replace(holiday, date_complete == "01/15/2018"|date_complete == "01/13/2018"|date_complete =="01/14/2018", "Martin Luther King Day Weekend (3 days)"),
         holiday = replace(holiday, date_complete == "02/17/2018"|date_complete == "02/19/2018"|date_complete =="02/18/2018", "Presidents Day Weekend (3 days)"),
         holiday = replace(holiday, date_complete == "05/28/2018"|date_complete == "05/26/2018"|date_complete =="05/27/2018", "Memorial Day Weekend (3 days)"),
         holiday = replace(holiday, date_complete == "07/04/2018"|date_complete == "07/07/2018"|date_complete =="07/08/2018", "Independence Day Weekend (3 days)"),
         holiday = replace(holiday, date_complete == "09/03/2018"|date_complete == "09/01/2018"|date_complete =="09/02/2018", "Labor Day Weekend (3 days)"),
         holiday = replace(holiday, date_complete == "11/22/2018"|date_complete == "11/23/2018"|date_complete =="11/24/2018"|date_complete =="11/25/2018", "Thanksgiving Weekend (4 days)"),
         holiday = replace(holiday, date_complete == "12/25/2018"|date_complete == "12/26/2018"|date_complete =="12/27/2018", "Christmas Day Weekend (3 days)")
         )

holiday_data_weekend =
  holiday_data_weekend %>% 
  drop_na(persons_injured)

holiday_data_weekend %>% 
  group_by(holiday) %>% 
  summarise(total_injured = sum(persons_injured))
## # A tibble: 8 x 2
##   holiday                                 total_injured
##   <chr>                                           <dbl>
## 1 Christmas Day Weekend (3 days)                    270
## 2 Independence Day Weekend (3 days)                 256
## 3 Labor Day Weekend (3 days)                        310
## 4 Martin Luther King Day Weekend (3 days)           287
## 5 Memorial Day Weekend (3 days)                     300
## 6 No Holiday                                      34273
## 7 Presidents Day Weekend (3 days)                   233
## 8 Thanksgiving Weekend (4 days)                     304
bronx = 
  holiday_data_weekend %>% 
  filter(borough == "BRONX") %>% 
  group_by(holiday) %>% 
  summarise(bronx_total_injured = sum(persons_injured))
brooklyn = 
  holiday_data_weekend %>% 
  filter(borough == "BROOKLYN") %>% 
  group_by(holiday) %>% 
  summarise(brooklyn_total_injured = sum(persons_injured)) %>% 
  select(brooklyn_total_injured)
manhattan = 
  holiday_data_weekend %>% 
  filter(borough == "MANHATTAN") %>% 
  group_by(holiday) %>% 
  summarise(manhattan_total_injured = sum(persons_injured))%>% 
  select(manhattan_total_injured)
queens = 
  holiday_data_weekend %>% 
  filter(borough == "QUEENS") %>% 
  group_by(holiday) %>% 
  summarise(queens_total_injured = sum(persons_injured))%>% 
  select(queens_total_injured)
staten_island = 
  holiday_data_weekend %>% 
  filter(borough == "STATEN ISLAND") %>% 
  group_by(holiday) %>% 
  summarise(staten_island_total_injured = sum(persons_injured))%>% 
  select(staten_island_total_injured)

holiday_borough_weekend = 
  cbind(bronx, brooklyn, manhattan, queens, staten_island) %>% 
  mutate(total = bronx_total_injured+brooklyn_total_injured+manhattan_total_injured+    queens_total_injured+staten_island_total_injured) %>% 
  filter(holiday!="No Holiday")

knitr::kable(holiday_borough_weekend)
holiday bronx_total_injured brooklyn_total_injured manhattan_total_injured queens_total_injured staten_island_total_injured total
Christmas Day Weekend (3 days) 44 124 28 65 9 270
Independence Day Weekend (3 days) 44 92 41 72 7 256
Labor Day Weekend (3 days) 34 149 23 94 10 310
Martin Luther King Day Weekend (3 days) 50 100 54 72 11 287
Memorial Day Weekend (3 days) 52 108 39 91 10 300
Presidents Day Weekend (3 days) 35 74 36 83 5 233
Thanksgiving Weekend (4 days) 55 91 44 110 4 304
plot_weekend = 
pivot_longer(
    holiday_borough_weekend, 
    bronx_total_injured:staten_island_total_injured,
    names_to = "borough", 
    values_to = "person_injured") %>% 
  ggplot(aes(x = reorder(holiday, -total), y = person_injured, fill = borough)) +
  geom_bar(stat="identity") +
  theme(legend.position = "right", axis.text.x = element_text(angle = 45)) +
  labs(
    title = "Persons Injured by Car Crashs on \n Federal Holidays Weekends (2018)",
    x = "Holiday Weekends",
    y = "# Persons Injured") +
  scale_fill_discrete(name = "Borough", labels = c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"))

plot_day + plot_weekend

Weather

climate_df = 
  read_csv("./large_data/climate.csv") %>% 
  janitor::clean_names() %>% 
  select(date,hourly_dry_bulb_temperature,hourly_precipitation,daily_weather) %>%
  separate(date, into = c("date", "time"), sep = " ") %>% 
  mutate(
    date = as.Date(date)
  )
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   DATE = col_datetime(format = ""),
##   REPORT_TYPE = col_character(),
##   BackupDirection = col_logical(),
##   BackupDistance = col_logical(),
##   BackupDistanceUnit = col_logical(),
##   BackupElements = col_logical(),
##   BackupElevation = col_logical(),
##   BackupElevationUnit = col_logical(),
##   BackupEquipment = col_logical(),
##   BackupLatitude = col_logical(),
##   BackupLongitude = col_logical(),
##   BackupName = col_logical(),
##   DailyPeakWindDirection = col_character(),
##   DailyPrecipitation = col_character(),
##   DailySnowDepth = col_character(),
##   DailySnowfall = col_character(),
##   DailySustainedWindDirection = col_character(),
##   DailyWeather = col_character(),
##   HeavyFog = col_logical(),
##   HourlyPrecipitation = col_character()
##   # ... with 29 more columns
## )
## See spec(...) for full column specifications.
all_day_weather_df =
  climate_df %>% filter(time == "23:59:00") %>% 
  distinct(date,daily_weather) %>% 
  filter(!duplicated(date)) %>% 
  separate(daily_weather, into = c("weathertype", "other"), sep = 2) %>%
  mutate(
    weathertype = recode(weathertype,
                      "SN" = "snow",
                      "HZ" = "haze",
                      "RA" = "rain",
                      "FG" = "fog",
                      "BR" = "mist"
                      ),
    weathertype = replace_na(weathertype,"sunny")) %>% 
  select(-other)

collision_all = 
tidy_data %>% 
  group_by(date_complete) %>% 
  summarise(
    collision_event = n(),
    injured_event = sum(persons_injured != 0, na.rm = TRUE),
    killed_event = sum(persons_killed != 0, na.rm = TRUE)
  )

collision_weather = 
bind_cols(all_day_weather_df,collision_all) %>% 
  select(-date_complete)

Weather type

weather_type_plot =  
collision_weather %>% 
  group_by(weathertype) %>% 
  summarise(
    count = mean(collision_event)
  ) %>% 
  mutate(
    weathertype = fct_reorder(weathertype,count)
  ) %>% 
  ggplot(aes(x = weathertype, y = count,
             fill = weathertype)) + 
  geom_bar(stat = "identity", width = 0.6) +
  coord_flip()
  
ggplotly(weather_type_plot)

Daily weather

daily_weather_df = 
climate_df %>% 
  separate(time, into = c("hour", "minute","second"), sep = ":") %>% 
  separate(date, into = c("year","month", "day"), sep = "-") %>% 
  separate(hourly_precipitation, into = c("prep","other"), sep = 4) %>% 
  select(-minute,-second,-year,-daily_weather,-other) %>% 
  mutate(
    month = as.numeric(month),
    day = as.numeric(day),
    hour = as.numeric(hour),
    prep = replace_na(as.numeric(prep),0)
  )

daily_weather_final = daily_weather_df[rep(seq_len(nrow(daily_weather_df)), each = 3), ]

daily_collision_data = 
tidy_data %>% 
  select(month,day,hour,persons_injured,persons_killed) %>% 
  group_by(month,day,hour) %>% 
  summarise(
    accident = n(),
    injured = sum(persons_injured,na.rm = TRUE),
    killed = sum(persons_killed,na.rm = TRUE)
  ) %>% 
  pivot_longer(
    accident:killed,
    names_to = "events",
    values_to = "number"
  )

daily_weather_collision = 
  left_join(daily_collision_data, daily_weather_df,
            by = c("month","day","hour")) %>% 
  mutate(
    prep_degree = 
      if_else(prep %in% 0:0.10,"light",
              if_else( prep < 0.3 & prep > 0.11 , "moderate","heavy")),
  prep_degree = factor(prep_degree, levels =  c("heavy","moderate","light")))

daily_weather_plot =
daily_weather_collision %>% 
  ggplot(aes(hour, y = number, color = events)) +
  geom_point(alpha = 0.5) + 
  facet_grid(~prep_degree) +
  labs(
    color = "events"
  )

ggplotly(daily_weather_plot)